In [1]:
%pylab inline
In [2]:
# Standard-library imports
import os
import pickle
import re
import sys

# Third-party imports
# Explicit numpy import: don't rely on the %pylab magic's implicit imports,
# which are deprecated and make the cell fragile on a fresh kernel.
import numpy
import matplotlib as pl
import matplotlib.pyplot as plt

# Seed the global NumPy RNG so any stochastic results are reproducible.
numpy.random.seed(42)
In [3]:
# Base directory of the course materials.
# NOTE(review): hardcoded absolute local path — adjust for your machine.
dataPath = '/Users/omojumiller/mycode/MachineLearningNanoDegree/IntroToMachineLearning/'

# Make the course helper modules importable.
for subdir in ('tools/', 'final_project/'):
    sys.path.append(dataPath + subdir)
In [4]:
# Load the Enron project dataset: {person_name: {feature_name: value}}.
# Pickle files must be opened in binary mode ("rb"): text mode ("r") breaks
# under Python 3 and is fragile on non-Unix platforms.
# (pickle.load can execute arbitrary code on untrusted input; this file
# ships with the course materials, so it is trusted here.)
with open(dataPath + 'final_project/final_project_dataset.pkl', "rb") as data_file:
    data_dict = pickle.load(data_file)
In [5]:
# True division for Python 2 (must come before other statements in the cell).
from __future__ import division

# Sanity-check the fraction computation on a single record.
data_point = data_dict['METTS MARK']
frac = data_point["from_poi_to_this_person"] / data_point["to_messages"]
# print() call form works in both Python 2 and Python 3.
print(frac)
In [6]:
def computeFraction( poi_messages, all_messages ):
    """ given a number messages to/from POI (numerator)
        and number of all messages to/from a person (denominator),
        return the fraction of messages to/from that person
        that are from/to a POI

        Missing email features are encoded as the string "NaN"; if either
        argument is "NaN" (or the denominator is zero), return 0, per the
        exercise spec below.
    """
    ### the same code can be used to compute either quantity:
    ### the fraction of all messages to this person that come from POIs, or
    ### the fraction of all messages from this person that are sent to POIs
    ### beware of "NaN" when there is no known email address (and so
    ### no filled email features), and integer division!
    ### in case of poi_messages or all_messages having "NaN" value, return 0.

    # BUGFIX: the original only checked poi_messages; all_messages == "NaN"
    # produced float('NaN') (i.e. nan) instead of 0, and a zero denominator
    # raised ZeroDivisionError.
    if poi_messages == 'NaN' or all_messages == 'NaN' or not all_messages:
        return 0
    # float() guards against Python 2 integer division.
    return float(poi_messages) / float(all_messages)
In [7]:
# Engineer two new features per person: the fraction of received messages
# that came from POIs, and the fraction of sent messages that went to POIs.
submit_dict = {}
for name in data_dict:
    data_point = data_dict[name]

    # Fraction of the messages this person received that came from POIs.
    from_poi_to_this_person = data_point["from_poi_to_this_person"]
    to_messages = data_point["to_messages"]
    fraction_from_poi = computeFraction( from_poi_to_this_person, to_messages )
    # print() call form works in both Python 2 and Python 3.
    print('{:5}{:35}{:.2f}'.format('FROM ', name, fraction_from_poi))
    data_point["fraction_from_poi"] = fraction_from_poi

    # Fraction of the messages this person sent that went to POIs.
    from_this_person_to_poi = data_point["from_this_person_to_poi"]
    from_messages = data_point["from_messages"]
    fraction_to_poi = computeFraction( from_this_person_to_poi, from_messages )
    print('{:5}{:35}{:.2f}'.format('TO: ', name, fraction_to_poi))

    # NOTE(review): the submission keys reuse the raw-count feature names but
    # hold the engineered *fractions* — kept as-is for grader compatibility.
    submit_dict[name]={"from_poi_to_this_person":fraction_from_poi,
                       "from_this_person_to_poi":fraction_to_poi}
    data_point["fraction_to_poi"] = fraction_to_poi

#####################
def submitDict():
    """Return the dict of engineered fractions, keyed by person name."""
    return submit_dict
When Katie was working on the Enron POI identifier, she engineered a feature that identified when a given person was on the same email as a POI. So for example, if Ken Lay and Katie Malone are both recipients of the same email message, then Katie Malone should have her "shared receipt" feature incremented. If she shares lots of emails with POIs, maybe she's a POI herself.
Here's the problem: there was a subtle bug — a POI's own "shared receipt" counter was also incremented whenever that POI appeared on an email. So Ken Lay's counter went up simply because he himself is a POI, which meant Ken Lay always "shared receipt" with a POI. The "shared receipt" feature therefore became extremely powerful in finding POIs, because it effectively encoded each person's label as a feature.
We found this first by being suspicious of a classifier that was always returning 100% accuracy. Then we removed features one at a time, and found that this feature was driving all the performance. Then, digging back through the feature code, we found the bug outlined above. We changed the code so that a person's "shared receipt" feature was only incremented if there was a different POI who received the email, reran the code, and tried again. The accuracy dropped to a more reasonable level.
We take a couple of lessons from this: be skeptical of results that look too good to be true (like 100% accuracy), and removing features one at a time is a practical way to isolate the source of suspiciously strong performance.
In [16]:
# Load the word data produced by the text_learning mini-project.
sys.path.append(dataPath+'text_learning/')
words_file = "your_word_data.pkl"
authors_file = "your_email_authors.pkl"

# BUGFIX: the originals opened the pickles in text mode ("r") and never
# closed them; use context managers and binary mode ("rb"), which pickle
# requires under Python 3.
with open(words_file, "rb") as f:
    word_data = pickle.load(f)
with open(authors_file, "rb") as f:
    authors = pickle.load(f)
In [17]:
### test_size is the percentage of events assigned to the test set (the
### remainder go into training)
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# sklearn.model_selection provides the same train_test_split.
from sklearn.model_selection import train_test_split

features_train, features_test, labels_train, labels_test = train_test_split(
    word_data, authors, test_size=0.1, random_state=42)
In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF encode the email text: sublinear tf scaling, drop terms appearing
# in more than half the documents, and remove English stop words.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
# Fit the vocabulary/idf on the training split only, then apply to both
# splits (avoids leaking test-set statistics into the features).
features_train = vectorizer.fit_transform(features_train)
# Dense array for compatibility with older classifier versions; the training
# matrix is densified later, after slicing to 150 rows.
features_test = vectorizer.transform(features_test).toarray()
In [19]:
### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
n_train = 150
features_train = features_train[:n_train].toarray()
labels_train = labels_train[:n_train]
This is an iterative process.
In [20]:
from sklearn import tree

# Fit an (intentionally) overfit-prone decision tree on the tiny training set.
clf = tree.DecisionTreeClassifier()
clf.fit(features_train, labels_train)
# print() call form works in both Python 2 and 3; also fixes the
# "accurancy" typo in the output message.
print("{}{:.2f}".format("Classifier accuracy: ", clf.score(features_test, labels_test)))
In [25]:
import operator

# Inspect which features the tree leans on: any importance above the
# threshold is suspiciously strong for a single word and may indicate a
# label-leaking feature.
IMPORTANCE_THRESHOLD = 0.2

featuresImportance = clf.feature_importances_
# Collect [feature_index, importance] pairs above the threshold...
featuresSortedByScore = [
    [idx, score]
    for idx, score in enumerate(featuresImportance)
    if score > IMPORTANCE_THRESHOLD
]
# ...and sort them by importance, highest first.
df = sorted(featuresSortedByScore, key=operator.itemgetter(1), reverse=True)
for idx, score in df:
    # print() call form works in both Python 2 and 3.
    print("{:5d}: {:f}".format(idx, score))
In [26]:
# Resolve the suspicious feature indices back to their words.
# Hoist get_feature_names() out of the loop: it rebuilds the whole
# vocabulary list on every call.
feature_names = vectorizer.get_feature_names()
for idx, _score in df:
    print(feature_names[idx])
In [ ]: